{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 准备工作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "阿里研究院\n",
    "阿里健康\n",
    "阿里巴巴商学院\n",
    "阿里数据\n",
    "\n",
    "腾讯金融科技\n",
    "腾讯研究院\n",
    "腾讯媒体研究院\n",
    "腾讯云启研究院\n",
    "酷鹅用户研究院\n",
    "'''\n",
    "公众号 = \"厦门魔法阿嚒\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "fn = { \"output\" : { \"公众号_htm_snippets\": \"data_raw_src/公众号_htm_snippets_{公众号}.tsv\",\n",
    "                    \"公众号_df\": \"data_raw_src/公众号_df_{公众号}.tsv\",\n",
    "                    \"公众号_xlsx\": \"data_sets/公众号_url_{公众号}.xlsx\" } \\\n",
    "      }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 采集公众号（requests）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "5\n",
      "10\n",
      "15\n",
      "20\n"
     ]
    }
   ],
   "source": [
    "# 目标url\n",
    "\n",
    "import time\n",
    "import requests\n",
    "import pandas as pd\n",
    "import csv\n",
    "\n",
    "\n",
    "url = \"https://mp.weixin.qq.com/cgi-bin/appmsg\"\n",
    "\n",
    "# 使用Cookie，跳过登陆操作\n",
    "headers = {\n",
    "  \"Cookie\": \"RK=rOLxcn5lEs; ptcz=a16a0fe4d539c7e8f19fc79cd0aa6d4c385390b18094c05bec2957df7d659373; ua_id=BuhIXGTdT4p67Ur4AAAAACWKrVPibjH97jZn3hlum_Y=; mm_lang=zh_CN; pgv_pvi=2624073728; pgv_pvid=1791612570; ts_uid=348923712; tvfe_boss_uuid=a0f19db8596c2b94; o_cookie=1051623960; wxuin=69343130600758; ptui_loginuin=1051623960@qq.com; pgv_si=s2371176448; uuid=dab310ce40f495ac4614e798c9d6a76a; rand_info=CAESIGtCuImrBgPh3xSFFciBCEn46O+0LCgArUAY04mEgt8G; slave_bizuin=3885036492; data_bizuin=3885036492; bizuin=3885036492; data_ticket=tzmZN9wojGAmx4UwxeaImlq7H239P9CXlZFiFEi/1LWejLK4/7w/5jSUy7YL5Jty; slave_sid=MnlyM1BpZVRvc2hFb2x1WDZTdmlGNWhuTFlvNjNLZ0g1QzdzU1dYVHMxSE96YjhFRlhVTTdtcXpLR2o0QWZIbG5ZV1RwNVlXS3FqZ0pVeDFmcTM3ZF9Wek5WeEZmY0pEeFhsSXRVQklmeUJlcVlOVDlZbWdZSTRyZVkzWjZ5R0lOWVIxM0dKeGZ6TGF2WWYz; slave_user=gh_034e7f1e480d; xid=cacb353e0fb623e95aab2386d6ff0e56; openid2ticket_ozPOQ59FqrUd8sJh9dW8w_3cfnCA=X7MJpaPg/Aa8AynDlgb7ve3PUdWBse9hwyagzPMx7LI=\",\n",
    "  \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36\"}\n",
    "\n",
    "data = {\n",
    "    \"token\": \"981752376\",\n",
    "    \"lang\": \"zh_CN\",\n",
    "    \"f\": \"json\",\n",
    "    \"ajax\": \"1\",\n",
    "    \"action\": \"list_ex\",\n",
    "    \"begin\": \"0\",\n",
    "    \"count\": \"5\",\n",
    "    \"query\": \"\",\n",
    "    \"fakeid\": \"MjM5MjY2MzEyMA==\",\n",
    "    \"type\": \"9\",\n",
    "}\n",
    "\n",
    "\n",
    "\n",
    "content_list=[]\n",
    "\n",
    "for i in range(5):\n",
    "    data[\"begin\"] = i*5\n",
    "    print(data[\"begin\"])\n",
    "    time.sleep(4)\n",
    "    # 使用get方法进行提交\n",
    "    content_json = requests.get(url, headers=headers, params=data).json()\n",
    "#     print(content_json)\n",
    "    # 返回了一个json，里面是每一页的数据\n",
    "    for item in content_json[\"app_msg_list\"]:\n",
    "    # 提取每页文章的标题及对应的url\n",
    "        items = []\n",
    "        items.append(item[\"title\"])\n",
    "        items.append(item[\"link\"])\n",
    "        items.append(item[\"create_time\"])\n",
    "        content_list.append(items)\n",
    "\n",
    "\n",
    "name=['title','link','create_time']\n",
    "test=pd.DataFrame(columns=name,data=content_list)\n",
    "with pd.ExcelWriter(fn[\"output\"][\"公众号_xlsx\"].format(公众号=\"魔法阿嚒_requests\")) as writer:\n",
    "    test.to_excel(writer)\n",
    "\n",
    "# test.to_csv(\"../微信公众号爬虫_zhichao/南方周末.csv\",mode='a',encoding='utf-8')\n",
    "# print(\"保存成功\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 采集公众号（selenium）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml.html import fromstring\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "# when selenium main_content is used\n",
    "# Parses an HTML document from a string constant.  Returns the root nood\n",
    "# root = fromstring(df.loc[1,\"html_snippets\"]) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用Selenium\n",
    "* 要更改 opts.binary_location 至自己本地的Chrome浏览器，建议portable\n",
    "* Chrome浏览器 和 chromedriver.exe要同版本号到小数后一位\n",
    "* 要确保可以 开启浏览器机器人\n",
    "* 要确保浏览器机器人 可以打开网页 driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda\\lib\\site-packages\\ipykernel_launcher.py:18: DeprecationWarning: use options instead of chrome_options\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')#解决DevToolsActivePort文件不存在的报错\n",
    "opts.add_argument('window-size=1920x3000') #指定浏览器分辨率\n",
    "opts.add_argument('--disable-gpu') #谷歌文档提到需要加上一这个属性来规避bug\n",
    "opts.add_argument('--hide-scrollbars') #隐藏滚动条, 应对些特殊页面\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度\n",
    "#opts.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败\n",
    "\n",
    "opts.binary_location = r\"C:\\Users\\10516\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe\" #\"H:\\_coding_\\Gitee\\InternetNewMedia\\CapstonePrj2016\\chromedriver.exe\"  \n",
    "\n",
    "# \"H:\\_coding_\\Gitee\\InternetNewMedia\\CapstonePrj2016\\chromedriver.exe\"  \n",
    "driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 填表登入"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "selenium 的定位方法\n",
    "* find_element_by_id &ensp;&ensp;&ensp;  根据标签id定位\n",
    "* find_element_by_name   &ensp;&ensp;&ensp; 根据标签的name定位\n",
    "* find_element_by_xpath  &ensp;&ensp;&ensp; 根据xpath定位\n",
    "* find_element_by_link_text  &ensp;&ensp;&ensp; 通过文字链接来定位元素\n",
    "* find_element_by_partial_link_text  &ensp;&ensp;&ensp;  通过文字链接来定位元素\n",
    "* find_element_by_tag_name  &ensp;&ensp;&ensp;  根据标签的名字定位\n",
    "* find_element_by_class_name  &ensp;&ensp;&ensp; 通过class name 定位\n",
    "* find_element_by_css_selector  &ensp;&ensp;&ensp;  根据元素属性来定位"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "ename": "ElementNotInteractableException",
     "evalue": "Message: element not interactable\n  (Session info: chrome=81.0.4044.138)\n",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mElementNotInteractableException\u001b[0m           Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-9-89b76106de98>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[0mpayload\u001b[0m \u001b[1;33m=\u001b[0m  \u001b[1;33m{\u001b[0m\u001b[1;34m\"account\"\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;34m\"3347149824@qq.com\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"password\"\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;34m\"sl520ykYk\"\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;31m# payload =  {\"account\": \"NFUHacks@163.com\", \"password\": \"NFU706947580\"}\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_xpath\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'//div[@class=\"login__type__container login__type__container__scan\"]/a'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclick\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py\u001b[0m in \u001b[0;36mclick\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m     78\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mclick\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     79\u001b[0m         \u001b[1;34m\"\"\"Clicks the element.\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 80\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_execute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mCommand\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mCLICK_ELEMENT\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     81\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     82\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0msubmit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py\u001b[0m in \u001b[0;36m_execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m    631\u001b[0m             \u001b[0mparams\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    632\u001b[0m         \u001b[0mparams\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'id'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_id\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 633\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_parent\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    634\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    635\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mfind_element\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mby\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mBy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mID\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m    319\u001b[0m         \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommand_executor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdriver_command\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    320\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 321\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror_handler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcheck_response\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    322\u001b[0m             response['value'] = self._unwrap_value(\n\u001b[0;32m    323\u001b[0m                 response.get('value', None))\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py\u001b[0m in \u001b[0;36mcheck_response\u001b[1;34m(self, response)\u001b[0m\n\u001b[0;32m    240\u001b[0m                 \u001b[0malert_text\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'alert'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    241\u001b[0m             \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0malert_text\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 242\u001b[1;33m         \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    243\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    244\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_value_or_default\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mElementNotInteractableException\u001b[0m: Message: element not interactable\n  (Session info: chrome=81.0.4044.138)\n"
     ]
    }
   ],
   "source": [
    "payload =  {\"account\": \"3347149824@qq.com\", \"password\": \"sl520ykYk\"}\n",
    "# payload =  {\"account\": \"NFUHacks@163.com\", \"password\": \"NFU706947580\"}\n",
    "driver.find_element_by_xpath('//div[@class=\"login__type__container login__type__container__scan\"]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "WebDriver 常用方法：\n",
    "* clear()清楚文本\n",
    "* send_keys(values)模拟按键输入\n",
    "* click()模拟点击\n",
    "* submit模拟提交"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"account\"]').clear()\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"account\"]').send_keys(payload['account'])\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"password\"]').clear()\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"password\"]').send_keys(payload['password'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//div[@class=\"login_btn_panel\"]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 点选单"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "其他常用方法\n",
    "* size：返回元素的尺寸\n",
    "* text：获取元素的文本\n",
    "* get_attribute：获取属性值  &ensp;&ensp;&ensp; get_attribute('innerHTML')获取元素内的全部HTML\n",
    "* is_displayed()：设置该元素用户是否可见"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'展开'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//a[@id=\"m_open\"]')\n",
    "element.click()\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.execute_script(\"window.scrollTo(0,document.body.scrollHeight)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://mp.weixin.qq.com/cgi-bin/appmsg?begin=0&count=10&t=media/appmsg_list&type=10&action=list&token=1827007293&lang=zh_CN'"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//li[@title[contains(.,\"素材管理\")]]/a') \n",
    "# main_content = element.get_attribute('innerHTML')\n",
    "# main_content\n",
    "url_素材管理= element.get_attribute(\"href\")\n",
    "url_素材管理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(url_素材管理)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 新建图文消息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//*[text()[contains(.,\"新建图文消息\")]]') \n",
    "main_content = element.get_attribute('innerHTML')\n",
    "main_content\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CDwindow-469596259E3BCDCF8C4C341BD6630074', 'CDwindow-061D861B55D11CBE9C8233CF32CA4DC4']\n"
     ]
    }
   ],
   "source": [
    "print (driver.window_handles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 新建图文消息开了另一分视窗，所以要切换 switch_to \n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 超链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                超链接              \n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//*[text()[contains(.,\"超链接\")]]') \n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "选择其他公众号\n"
     ]
    }
   ],
   "source": [
    "# 点 选择其他公众号\n",
    "element = driver.find_element_by_xpath('//*[text()[contains(.,\"选择其他公众号\")]]') \n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//form//div[@class=\"inner_link_account_area\"]//input[@class=\"weui-desktop-form__input\"]').clear()\n",
    "driver.find_element_by_xpath('//form//div[@class=\"inner_link_account_area\"]//input[@class=\"weui-desktop-form__input\"]').send_keys(公众号)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-icon weui-desktop-icon__inputSearch weui-desktop-icon__small\"><!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <svg width=\"16\" height=\"16\" viewBox=\"0 0 16 16\" xmlns=\"http://www.w3.org/2000/svg\"><path d=\"M11.33 10.007l4.273 4.273a.502.502 0 0 1 .005.709l-.585.584a.499.499 0 0 1-.709-.004L10.046 11.3a6.278 6.278 0 1 1 1.284-1.294zm.012-3.729a5.063 5.063 0 1 0-10.127 0 5.063 5.063 0 0 0 10.127 0z\"></path></svg> <!----> <!----> <!----> <!----></div>\n"
     ]
    }
   ],
   "source": [
    "# 点放大镜搜\n",
    "element = driver.find_element_by_xpath('//button[@class=\"weui-desktop-icon-btn weui-desktop-search__btn\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/VFHa9HgYHXomov15yQzibbRbq3O0icnz3FLKM56XfsY0B6J4efu0jUERnclVfp83zBWwuYRPqxMc1ZKnWCMq5ARA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">厦门魔法阿嬷</strong> <i class=\"inner_link_account_wechat\">微信号：XIAMEN100</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/lPIMMp6nOcdictRRUN1B7LuqerH7Qy3uYNVm7H1WG0UXsKIaxFaTxrTpa95eO4Q1Xg3pIWAbXJrc4tOaic41Cjhg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">泉州魔法阿嬷</strong> <i class=\"inner_link_account_wechat\">微信号：quanzhou717</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/ia5GdAIiaKwdhotOdiay0O4BrUnb7gul4pLoKSzRHhdk5HUrH9Jia7vCjKicRP6P1taPp4yhdEmL6vYxmrFaVeTT5PA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">厦门青菜公公</strong> <i class=\"inner_link_account_wechat\">微信号：xiamen618</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/labF00icrWviazYjPOHbEN4yt9tEMBvluibkpYmmVI8ewnG1ZORdR08lEGn8DTvaXly5HG1co6xscibjpT0TcU2Ieg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">创造与魔法手游</strong> <i class=\"inner_link_account_wechat\">微信号：czymf666</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/jiaBXqDpLPjQlFSWsSHmwphTFhOms7ekrB9Hy6I40SbP29Vbas8EzdDAQQGEIdmeHiaCdfKmg328Qaibz2TdFx1Jw/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">厦门阳少爷</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">服务号</div></li>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "公众号SERP = main_content\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 解析\n",
    "root = fromstring(公众号SERP) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "主 = root.xpath('//li[@class=\"inner_link_account_item\"]')\n",
    "\n",
    "account_list = []\n",
    "for e in 主:\n",
    "    account_nickname = e.xpath('./div/strong[@class=\"inner_link_account_nickname\"]')[0].text\n",
    "    account_wechat = e.xpath('./div/i[@class=\"inner_link_account_wechat\"]')[0].text\n",
    "    account_img = e.xpath('./div/img/@src')[0]\n",
    "    account = {\"nickname\": account_nickname, \"wechat\": account_wechat, \"img\": account_img,}\n",
    "    account_list.append(account)\n",
    "\n",
    "df_account = pd.DataFrame(account_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nickname</th>\n",
       "      <th>wechat</th>\n",
       "      <th>img</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>厦门魔法阿嬷</td>\n",
       "      <td>微信号：XIAMEN100</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/VFHa9HgYHXomov1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>泉州魔法阿嬷</td>\n",
       "      <td>微信号：quanzhou717</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/lPIMMp6nOcdictR...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>厦门青菜公公</td>\n",
       "      <td>微信号：xiamen618</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/ia5GdAIiaKwdhot...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>创造与魔法手游</td>\n",
       "      <td>微信号：czymf666</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/labF00icrWviazY...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>厦门阳少爷</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/jiaBXqDpLPjQlFS...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  nickname           wechat                                                img\n",
       "0   厦门魔法阿嬷    微信号：XIAMEN100  http://mmbiz.qpic.cn/mmbiz_png/VFHa9HgYHXomov1...\n",
       "1   泉州魔法阿嬷  微信号：quanzhou717  http://mmbiz.qpic.cn/mmbiz_png/lPIMMp6nOcdictR...\n",
       "2   厦门青菜公公    微信号：xiamen618  http://mmbiz.qpic.cn/mmbiz_png/ia5GdAIiaKwdhot...\n",
       "3  创造与魔法手游     微信号：czymf666  http://mmbiz.qpic.cn/mmbiz_png/labF00icrWviazY...\n",
       "4    厦门阳少爷          微信号：未设置  http://mmbiz.qpic.cn/mmbiz_png/jiaBXqDpLPjQlFS..."
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/VFHa9HgYHXomov15yQzibbRbq3O0icnz3FLKM56XfsY0B6J4efu0jUERnclVfp83zBWwuYRPqxMc1ZKnWCMq5ARA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">厦门魔法阿嬷</strong> <i class=\"inner_link_account_wechat\">微信号：XIAMEN100</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]/li')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n跳转_input = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/input\\')\\n跳转_a = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/a\\')\\n跳转_input.clear()\\n跳转_input.send_keys(2)\\n跳转_a.click()\\n'"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 跳转testing\n",
    "'''\n",
    "跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "跳转_input.clear()\n",
    "跳转_input.send_keys(2)\n",
    "跳转_a.click()\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[9, 388]\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# 跳转上限\n",
    "l_e = driver.find_elements_by_xpath('//label[@class=\"weui-desktop-pagination__num\"]')\n",
    "l_e_int  = [int(x.text) for x in l_e] \n",
    "print (l_e_int)\n",
    "print (l_e_int[0]==l_e_int[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388]\n"
     ]
    }
   ],
   "source": [
    "pages = list(range(l_e_int[0],l_e_int[-1]+1 ))\n",
    "#print(pages[0:2])\n",
    "pages = list(range(1,l_e_int[-1]+1 ))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 循环/遍历"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# global varialbes \n",
    "html_raw = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "\n",
    "        跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "        跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "        跳转_input.clear()\n",
    "        跳转_input.send_keys(p)\n",
    "        跳转_a.click()\n",
    "\n",
    "        time.sleep(1+10*random())\n",
    "\n",
    "        element = driver.find_element_by_xpath('//div[@class=\"inner_link_article_list\"]')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        #print(main_content)\n",
    "        html_raw[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t50\t51\t52\t53\t54\t55\t56\t57\t58\t59\t60\t61\t62\t63\t64\t65\t66\t67\t68\t69\t70\t71\t72\t73\t74\t75\t76\t77\t78\t79\t80\t81\t82\t83\t84\t85\t86\t87\t88\t89\t90\t91\t92\t93\t94\t95\t96\t97\t98\t99\t100\t101\t102\t103\t104\t105\t106\t107\t108\t109\t110\t111\t112\t113\t114\t115\t116\t117\t118\t119\t120\t121\t122\t123\t124\t125\t126\t127\t128\t129\t130\t131\t132\t133\t134\t135\t136\t137\t138\t139\t140\t141\t142\t143\t144\t145\t146\t147\t148\t149\t150\t151\t152\t153\t154\t155\t156\t157\t158\t159\t160\t161\t162\t163\t164\t165\t166\t167\t168\t169\t170\t171\t172\t173\t174\t175\t176\t177\t178\t179\t180\t181\t182\t183\t184\t185\t186\t187\t188\t189\t190\t191\t192\t193\t194\t195\t196\t197\t198\t199\t200\t201\t202\t203\t204\t205\t206\t207\t208\t209\t210\t211\t212\t213\t214\t215\t216\t217\t218\t219\t220\t221\t222\t223\t224\t225\t226\t227\t228\t229\t230\t231\t232\t233\t234\t235\t236\t237\t238\t239\t240\t241\t242\t243\t244\t245\t246\t247\t248\t249\t250\t251\t252\t253\t254\t255\t256\t257\t258\t259\t260\t261\t262\t263\t264\t265\t266\t267\t268\t269\t270\t271\t272\t273\t274\t275\t276\t277\t278\t279\t280\t281\t282\t283\t284\t285\t286\t287\t288\t289\t290\t291\t292\t293\t294\t295\t296\t297\t298\t299\t300\t301\t302\t303\t304\t305\t306\t307\t308\t309\t310\t311\t312\t313\t314\t315\t316\t317\t318\t319\t320\t321\t322\t323\t324\t325\t326\t327\t328\t329\t330\t331\t332\t333\t334\t335\t"
     ]
    },
    {
     "ename": "NoSuchElementException",
     "evalue": "Message: no such element: Unable to locate element: {\"method\":\"xpath\",\"selector\":\"//div[@class=\"inner_link_article_list\"]\"}\n  (Session info: chrome=81.0.4044.138)\n",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNoSuchElementException\u001b[0m                    Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-45-ffac2da82a61>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mprocess_pages\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpages\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m<ipython-input-43-5a6165b2dfd8>\u001b[0m in \u001b[0;36mprocess_pages\u001b[1;34m(pages)\u001b[0m\n\u001b[0;32m     11\u001b[0m         \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m+\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mrandom\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     12\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 13\u001b[1;33m         \u001b[0melement\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_xpath\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'//div[@class=\"inner_link_article_list\"]'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     14\u001b[0m         \u001b[0mmain_content\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0melement\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_attribute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'innerHTML'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     15\u001b[0m         \u001b[1;31m#print(main_content)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mfind_element_by_xpath\u001b[1;34m(self, xpath)\u001b[0m\n\u001b[0;32m    392\u001b[0m             \u001b[0melement\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_xpath\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'//div/td[1]'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    393\u001b[0m         \"\"\"\n\u001b[1;32m--> 394\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mby\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mBy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mXPATH\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mxpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    395\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    396\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mfind_elements_by_xpath\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mxpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mfind_element\u001b[1;34m(self, by, value)\u001b[0m\n\u001b[0;32m    976\u001b[0m         return self.execute(Command.FIND_ELEMENT, {\n\u001b[0;32m    977\u001b[0m             \u001b[1;34m'using'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mby\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 978\u001b[1;33m             'value': value})['value']\n\u001b[0m\u001b[0;32m    979\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    980\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mfind_elements\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mby\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mBy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mID\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m    319\u001b[0m         \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommand_executor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdriver_command\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    320\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 321\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror_handler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcheck_response\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    322\u001b[0m             response['value'] = self._unwrap_value(\n\u001b[0;32m    323\u001b[0m                 response.get('value', None))\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py\u001b[0m in \u001b[0;36mcheck_response\u001b[1;34m(self, response)\u001b[0m\n\u001b[0;32m    240\u001b[0m                 \u001b[0malert_text\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'alert'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    241\u001b[0m             \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0malert_text\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 242\u001b[1;33m         \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    243\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    244\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_value_or_default\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNoSuchElementException\u001b[0m: Message: no such element: Unable to locate element: {\"method\":\"xpath\",\"selector\":\"//div[@class=\"inner_link_article_list\"]\"}\n  (Session info: chrome=81.0.4044.138)\n"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>330</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>331</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>332</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>333</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>334</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>334 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         html_snippets\n",
       "1    <div><label class=\"inner_link_article_item\"><s...\n",
       "2    <div><label class=\"inner_link_article_item\"><s...\n",
       "3    <div><label class=\"inner_link_article_item\"><s...\n",
       "4    <div><label class=\"inner_link_article_item\"><s...\n",
       "5    <div><label class=\"inner_link_article_item\"><s...\n",
       "..                                                 ...\n",
       "330  <div><label class=\"inner_link_article_item\"><s...\n",
       "331  <div><label class=\"inner_link_article_item\"><s...\n",
       "332  <div><label class=\"inner_link_article_item\"><s...\n",
       "333  <div><label class=\"inner_link_article_item\"><s...\n",
       "334  <div><label class=\"inner_link_article_item\"><s...\n",
       "\n",
       "[334 rows x 1 columns]"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame([html_raw]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stored 'html_raw' (dict)\n"
     ]
    }
   ],
   "source": [
    "%store html_raw\n",
    "import pickle \n",
    "filehandler = open(\"html_raw\", 'wb') \n",
    "pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "26\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>305</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>306</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>307</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>308</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>309</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>308 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         html_snippets\n",
       "2    <div><label class=\"inner_link_article_item\"><s...\n",
       "3    <div><label class=\"inner_link_article_item\"><s...\n",
       "4    <div><label class=\"inner_link_article_item\"><s...\n",
       "5    <div><label class=\"inner_link_article_item\"><s...\n",
       "6    <div><label class=\"inner_link_article_item\"><s...\n",
       "..                                                 ...\n",
       "305  <div><label class=\"inner_link_article_item\"><s...\n",
       "306  <div><label class=\"inner_link_article_item\"><s...\n",
       "307  <div><label class=\"inner_link_article_item\"><s...\n",
       "308  <div><label class=\"inner_link_article_item\"><s...\n",
       "309  <div><label class=\"inner_link_article_item\"><s...\n",
       "\n",
       "[308 rows x 1 columns]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_out = df[~df.duplicated()]\n",
    "print (len(df_out))\n",
    "df[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[2,\n",
       " 3,\n",
       " 4,\n",
       " 5,\n",
       " 6,\n",
       " 7,\n",
       " 8,\n",
       " 9,\n",
       " 10,\n",
       " 11,\n",
       " 12,\n",
       " 13,\n",
       " 14,\n",
       " 15,\n",
       " 16,\n",
       " 17,\n",
       " 18,\n",
       " 19,\n",
       " 20,\n",
       " 21,\n",
       " 22,\n",
       " 23,\n",
       " 24,\n",
       " 25,\n",
       " 26,\n",
       " 27,\n",
       " 28,\n",
       " 29,\n",
       " 30,\n",
       " 31,\n",
       " 32,\n",
       " 33,\n",
       " 34,\n",
       " 35,\n",
       " 36,\n",
       " 37,\n",
       " 38,\n",
       " 39,\n",
       " 40,\n",
       " 41,\n",
       " 42,\n",
       " 43,\n",
       " 44,\n",
       " 45,\n",
       " 46,\n",
       " 47,\n",
       " 48,\n",
       " 49,\n",
       " 50,\n",
       " 51,\n",
       " 52,\n",
       " 53,\n",
       " 54,\n",
       " 55,\n",
       " 56,\n",
       " 57,\n",
       " 58,\n",
       " 59,\n",
       " 60,\n",
       " 61,\n",
       " 62,\n",
       " 63,\n",
       " 64,\n",
       " 65,\n",
       " 66,\n",
       " 67,\n",
       " 68,\n",
       " 69,\n",
       " 70,\n",
       " 71,\n",
       " 72,\n",
       " 73,\n",
       " 74,\n",
       " 75,\n",
       " 76,\n",
       " 77,\n",
       " 78,\n",
       " 79,\n",
       " 80,\n",
       " 81,\n",
       " 82,\n",
       " 83,\n",
       " 84,\n",
       " 85,\n",
       " 86,\n",
       " 87,\n",
       " 88,\n",
       " 89,\n",
       " 90,\n",
       " 91,\n",
       " 92,\n",
       " 93,\n",
       " 94,\n",
       " 95,\n",
       " 96,\n",
       " 97,\n",
       " 98,\n",
       " 99,\n",
       " 100,\n",
       " 101,\n",
       " 102,\n",
       " 103,\n",
       " 104,\n",
       " 105,\n",
       " 106,\n",
       " 107,\n",
       " 108,\n",
       " 109,\n",
       " 110,\n",
       " 111,\n",
       " 112,\n",
       " 113,\n",
       " 114,\n",
       " 115,\n",
       " 116,\n",
       " 117,\n",
       " 118,\n",
       " 119,\n",
       " 120,\n",
       " 121,\n",
       " 122,\n",
       " 123,\n",
       " 124,\n",
       " 125,\n",
       " 126,\n",
       " 127,\n",
       " 128,\n",
       " 129,\n",
       " 130,\n",
       " 131,\n",
       " 132,\n",
       " 133,\n",
       " 134,\n",
       " 135,\n",
       " 136,\n",
       " 137,\n",
       " 138,\n",
       " 139,\n",
       " 140,\n",
       " 141,\n",
       " 142,\n",
       " 143,\n",
       " 144,\n",
       " 145,\n",
       " 146,\n",
       " 147,\n",
       " 148,\n",
       " 149,\n",
       " 150,\n",
       " 151,\n",
       " 152,\n",
       " 153,\n",
       " 154,\n",
       " 155,\n",
       " 156,\n",
       " 157,\n",
       " 158,\n",
       " 159,\n",
       " 160,\n",
       " 161,\n",
       " 162,\n",
       " 163,\n",
       " 164,\n",
       " 165,\n",
       " 166,\n",
       " 167,\n",
       " 168,\n",
       " 169,\n",
       " 170,\n",
       " 171,\n",
       " 172,\n",
       " 173,\n",
       " 174,\n",
       " 175,\n",
       " 176,\n",
       " 177,\n",
       " 178,\n",
       " 179,\n",
       " 180,\n",
       " 181,\n",
       " 182,\n",
       " 183,\n",
       " 184,\n",
       " 185,\n",
       " 186,\n",
       " 187,\n",
       " 188,\n",
       " 189,\n",
       " 190,\n",
       " 191,\n",
       " 192,\n",
       " 193,\n",
       " 194,\n",
       " 195,\n",
       " 196,\n",
       " 197,\n",
       " 198,\n",
       " 199,\n",
       " 200,\n",
       " 201,\n",
       " 202,\n",
       " 203,\n",
       " 204,\n",
       " 205,\n",
       " 206,\n",
       " 207,\n",
       " 208,\n",
       " 209,\n",
       " 210,\n",
       " 211,\n",
       " 212,\n",
       " 213,\n",
       " 214,\n",
       " 215,\n",
       " 216,\n",
       " 217,\n",
       " 218,\n",
       " 219,\n",
       " 220,\n",
       " 221,\n",
       " 222,\n",
       " 223,\n",
       " 224,\n",
       " 225,\n",
       " 226,\n",
       " 227,\n",
       " 228,\n",
       " 229,\n",
       " 230,\n",
       " 231,\n",
       " 232,\n",
       " 233,\n",
       " 234,\n",
       " 235,\n",
       " 236,\n",
       " 237,\n",
       " 238,\n",
       " 239,\n",
       " 240,\n",
       " 241,\n",
       " 242,\n",
       " 243,\n",
       " 244,\n",
       " 245,\n",
       " 246,\n",
       " 247,\n",
       " 248,\n",
       " 249,\n",
       " 250,\n",
       " 251,\n",
       " 252,\n",
       " 253,\n",
       " 254,\n",
       " 255,\n",
       " 256,\n",
       " 257,\n",
       " 258,\n",
       " 259,\n",
       " 260,\n",
       " 261,\n",
       " 262,\n",
       " 263,\n",
       " 264,\n",
       " 265,\n",
       " 266,\n",
       " 267,\n",
       " 268,\n",
       " 269,\n",
       " 270,\n",
       " 271,\n",
       " 272,\n",
       " 273,\n",
       " 274,\n",
       " 275,\n",
       " 276,\n",
       " 277,\n",
       " 278,\n",
       " 279,\n",
       " 280,\n",
       " 281,\n",
       " 282,\n",
       " 283,\n",
       " 284,\n",
       " 285,\n",
       " 286,\n",
       " 287,\n",
       " 288,\n",
       " 289,\n",
       " 290,\n",
       " 291,\n",
       " 292,\n",
       " 293,\n",
       " 294,\n",
       " 295,\n",
       " 296,\n",
       " 297,\n",
       " 298,\n",
       " 299,\n",
       " 300,\n",
       " 301,\n",
       " 302,\n",
       " 303,\n",
       " 304,\n",
       " 305,\n",
       " 306,\n",
       " 307,\n",
       " 308,\n",
       " 309,\n",
       " 384,\n",
       " 385,\n",
       " 386,\n",
       " 387,\n",
       " 388,\n",
       " 335,\n",
       " 336,\n",
       " 337,\n",
       " 338,\n",
       " 339,\n",
       " 340,\n",
       " 341,\n",
       " 342,\n",
       " 343,\n",
       " 344,\n",
       " 345,\n",
       " 346,\n",
       " 347,\n",
       " 348,\n",
       " 349,\n",
       " 350,\n",
       " 351,\n",
       " 352,\n",
       " 353,\n",
       " 354,\n",
       " 355,\n",
       " 356,\n",
       " 357,\n",
       " 358,\n",
       " 359,\n",
       " 360,\n",
       " 361,\n",
       " 362,\n",
       " 363,\n",
       " 364,\n",
       " 365,\n",
       " 366,\n",
       " 367,\n",
       " 368,\n",
       " 369,\n",
       " 370,\n",
       " 371,\n",
       " 372,\n",
       " 373,\n",
       " 374,\n",
       " 375,\n",
       " 376,\n",
       " 377,\n",
       " 378,\n",
       " 379,\n",
       " 380,\n",
       " 381,\n",
       " 382,\n",
       " 383]"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "try_again = list(df[df.duplicated()].index)\n",
    "print(try_again)\n",
    "try_again = try_again + list (set(pages).difference(set(df.index.values)))\n",
    "try_again"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 暂存档"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = fn [\"output\"] [\"公众号_htm_snippets\"] \n",
    "df_out.to_csv(filename.format(公众号=公众号), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,11,17,14,16,16,12,6,8,7,11,10,11,14,14,8,7,7,6,6,5,7,5,6,5,6,"
     ]
    },
    {
     "ename": "KeyError",
     "evalue": "335",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36mget_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   2645\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2646\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   2647\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: 335",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-53-630ad00d9287>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      9\u001b[0m \u001b[0ml_df\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     10\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mpages\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 11\u001b[1;33m     \u001b[0m_df_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparse_html_snippets\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mp\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m\"html_snippets\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     12\u001b[0m     \u001b[0mprint\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_df_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mend\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\",\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     13\u001b[0m     \u001b[0ml_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_df_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m   1760\u001b[0m                 \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mKeyError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1761\u001b[0m                     \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1762\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_tuple\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1763\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1764\u001b[0m             \u001b[1;31m# we by definition only have the 0th axis\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_getitem_tuple\u001b[1;34m(self, tup)\u001b[0m\n\u001b[0;32m   1270\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_getitem_tuple\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtup\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mTuple\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1271\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1272\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_lowerdim\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtup\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1273\u001b[0m         \u001b[1;32mexcept\u001b[0m \u001b[0mIndexingError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1274\u001b[0m             \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_getitem_lowerdim\u001b[1;34m(self, tup)\u001b[0m\n\u001b[0;32m   1387\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtup\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1388\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mis_label_like\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtuple\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1389\u001b[1;33m                 \u001b[0msection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_axis\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1390\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1391\u001b[0m                 \u001b[1;31m# we have yielded a scalar ?\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_getitem_axis\u001b[1;34m(self, key, axis)\u001b[0m\n\u001b[0;32m   1963\u001b[0m         \u001b[1;31m# fall thru to straight lookup\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1964\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_key\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1965\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_label\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1966\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1967\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_get_label\u001b[1;34m(self, label, axis)\u001b[0m\n\u001b[0;32m    623\u001b[0m             \u001b[1;32mraise\u001b[0m \u001b[0mIndexingError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"no slices here, handle elsewhere\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    624\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 625\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_xs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    626\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    627\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_get_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36mxs\u001b[1;34m(self, key, axis, level, drop_level)\u001b[0m\n\u001b[0;32m   3535\u001b[0m             \u001b[0mloc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnew_index\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc_level\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdrop_level\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdrop_level\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3536\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3537\u001b[1;33m             \u001b[0mloc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3538\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3539\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36mget_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   2646\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2647\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2648\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_maybe_cast_indexer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   2649\u001b[0m         \u001b[0mindexer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_indexer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtolerance\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2650\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mindexer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m1\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mindexer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msize\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: 335"
     ]
    }
   ],
   "source": [
    "def parse_html_snippets(_snippet_):\n",
    "    root = fromstring(_snippet_) \n",
    "    title = [x.text for x in root.xpath('//div[@class=\"inner_link_article_title\"]')]\n",
    "    create_time = [x.text for x in root.xpath('//div[@class=\"inner_link_article_date\"]')]\n",
    "    link = [x for x in root.xpath('//a/@href')]\n",
    "    _df_ = pd.DataFrame({\"title\":title, \"create_time\": create_time, \"link\":link})\n",
    "    return(_df_)\n",
    "    \n",
    "l_df = []\n",
    "for p in pages:\n",
    "    _df_ = parse_html_snippets(df.loc[p,\"html_snippets\"])\n",
    "    print (len(_df_), end=\",\")\n",
    "    l_df.append(_df_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>带你飞起来的超HOT健身馆！趣味燃脂减压时尚，厦门绝无仅有的室内蹦极！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>【旺巴蜀】限时2.99折 ！100+种口味任意撸！人均10元+吃掉一座山！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>人均25＋！一口锅吃下一整头牛 ！现切牛肉片片都是粘人精 ！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>LV设计总监亲自操刀，万象城的艺术餐厅，超精致 ！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2 层楼的\"小香港\"，广东人扎堆去，这次终于被我找到了 ！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>336元双人酒店西餐免费送 ！厦门探秘圣地，就藏在这片「都会桃源」之中 ！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0°C创意铜锅，锅底吃到饱，厦门找不到第二家 ！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>SM这家壕气冲天的「日料放题」！刺身、雪花牛肉、鹅肝…各种尖货无限畅吃 ！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>宝龙一城这家超梦幻的玻璃花房竟然是家【燕窝博物馆】！美得仿佛坠入仙境！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>万象城“寿司”专门店 ！在日本火了50+年 ！双人套餐免费抢 ！</td>\n",
       "      <td>2019-03-21</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>火了26年的新疆菜， 招牌50CM长，3个人都吃不完！</td>\n",
       "      <td>2019-03-21</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    title create_time  \\\n",
       "0     带你飞起来的超HOT健身馆！趣味燃脂减压时尚，厦门绝无仅有的室内蹦极！  2019-03-23   \n",
       "1   【旺巴蜀】限时2.99折 ！100+种口味任意撸！人均10元+吃掉一座山！  2019-03-23   \n",
       "2          人均25＋！一口锅吃下一整头牛 ！现切牛肉片片都是粘人精 ！  2019-03-23   \n",
       "3               LV设计总监亲自操刀，万象城的艺术餐厅，超精致 ！  2019-03-23   \n",
       "4           2 层楼的\"小香港\"，广东人扎堆去，这次终于被我找到了 ！  2019-03-22   \n",
       "5   336元双人酒店西餐免费送 ！厦门探秘圣地，就藏在这片「都会桃源」之中 ！  2019-03-22   \n",
       "6                0°C创意铜锅，锅底吃到饱，厦门找不到第二家 ！  2019-03-22   \n",
       "7   SM这家壕气冲天的「日料放题」！刺身、雪花牛肉、鹅肝…各种尖货无限畅吃 ！  2019-03-22   \n",
       "8     宝龙一城这家超梦幻的玻璃花房竟然是家【燕窝博物馆】！美得仿佛坠入仙境！  2019-03-22   \n",
       "9        万象城“寿司”专门店 ！在日本火了50+年 ！双人套餐免费抢 ！  2019-03-21   \n",
       "10            火了26年的新疆菜， 招牌50CM长，3个人都吃不完！  2019-03-21   \n",
       "\n",
       "                                                 link  \n",
       "0   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "1   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "2   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "3   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "4   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "5   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "6   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "8   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "9   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "10  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  "
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_url_out.loc[0:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7646</th>\n",
       "      <td>立减 688 元 | 【厦门海景千禧大酒店】婚宴春季特惠，火爆预定中 ！</td>\n",
       "      <td>2016-03-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7647</th>\n",
       "      <td>厦门美食地图（二） | 带你发现一个吃货的世外桃源 - 杏林 ！</td>\n",
       "      <td>2016-03-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7648</th>\n",
       "      <td>发现厦门 | 一位旗袍美女引发的厦门老百货回忆思潮 ...</td>\n",
       "      <td>2016-03-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7649</th>\n",
       "      <td>榴莲控最爱 | 必胜客【泰国金枕榴莲披萨】只要39元 ！</td>\n",
       "      <td>2016-03-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7650</th>\n",
       "      <td>发现厦门 | 她在沙坡尾开了六年独立书店，面朝大海，气定神闲 ...</td>\n",
       "      <td>2016-03-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     title create_time  \\\n",
       "7646  立减 688 元 | 【厦门海景千禧大酒店】婚宴春季特惠，火爆预定中 ！  2016-03-16   \n",
       "7647      厦门美食地图（二） | 带你发现一个吃货的世外桃源 - 杏林 ！  2016-03-14   \n",
       "7648         发现厦门 | 一位旗袍美女引发的厦门老百货回忆思潮 ...  2016-03-08   \n",
       "7649          榴莲控最爱 | 必胜客【泰国金枕榴莲披萨】只要39元 ！  2016-03-07   \n",
       "7650    发现厦门 | 她在沙坡尾开了六年独立书店，面朝大海，气定神闲 ...  2016-03-03   \n",
       "\n",
       "                                                   link  \n",
       "7646  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7647  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7648  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7649  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7650  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  "
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out.tail(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>value</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>人均25＋！一口锅吃下一整头牛 ！现切牛肉片片都是粘人精 ！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>LV设计总监亲自操刀，万象城的艺术餐厅，超精致 ！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0°C创意铜锅，锅底吃到饱，厦门找不到第二家 ！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>SM这家壕气冲天的「日料放题」！刺身、雪花牛肉、鹅肝…各种尖货无限畅吃 ！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>宝龙一城这家超梦幻的玻璃花房竟然是家【燕窝博物馆】！美得仿佛坠入仙境！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7646</th>\n",
       "      <td>立减 688 元 | 【厦门海景千禧大酒店】婚宴春季特惠，火爆预定中 ！</td>\n",
       "      <td>2016-03-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7647</th>\n",
       "      <td>厦门美食地图（二） | 带你发现一个吃货的世外桃源 - 杏林 ！</td>\n",
       "      <td>2016-03-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7648</th>\n",
       "      <td>发现厦门 | 一位旗袍美女引发的厦门老百货回忆思潮 ...</td>\n",
       "      <td>2016-03-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7649</th>\n",
       "      <td>榴莲控最爱 | 必胜客【泰国金枕榴莲披萨】只要39元 ！</td>\n",
       "      <td>2016-03-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7650</th>\n",
       "      <td>发现厦门 | 她在沙坡尾开了六年独立书店，面朝大海，气定神闲 ...</td>\n",
       "      <td>2016-03-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3216 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       title create_time  \\\n",
       "value                                                      \n",
       "2             人均25＋！一口锅吃下一整头牛 ！现切牛肉片片都是粘人精 ！  2019-03-23   \n",
       "3                  LV设计总监亲自操刀，万象城的艺术餐厅，超精致 ！  2019-03-23   \n",
       "6                   0°C创意铜锅，锅底吃到饱，厦门找不到第二家 ！  2019-03-22   \n",
       "7      SM这家壕气冲天的「日料放题」！刺身、雪花牛肉、鹅肝…各种尖货无限畅吃 ！  2019-03-22   \n",
       "8        宝龙一城这家超梦幻的玻璃花房竟然是家【燕窝博物馆】！美得仿佛坠入仙境！  2019-03-22   \n",
       "...                                      ...         ...   \n",
       "7646    立减 688 元 | 【厦门海景千禧大酒店】婚宴春季特惠，火爆预定中 ！  2016-03-16   \n",
       "7647        厦门美食地图（二） | 带你发现一个吃货的世外桃源 - 杏林 ！  2016-03-14   \n",
       "7648           发现厦门 | 一位旗袍美女引发的厦门老百货回忆思潮 ...  2016-03-08   \n",
       "7649            榴莲控最爱 | 必胜客【泰国金枕榴莲披萨】只要39元 ！  2016-03-07   \n",
       "7650      发现厦门 | 她在沙坡尾开了六年独立书店，面朝大海，气定神闲 ...  2016-03-03   \n",
       "\n",
       "                                                    link  \n",
       "value                                                     \n",
       "2      http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "3      http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "6      http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7      http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "8      http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "...                                                  ...  \n",
       "7646   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7647   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7648   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7649   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7650   http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "\n",
       "[3216 rows x 3 columns]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tagging 标记\n",
    "tagging_list = [\"\",\"厦门\", \"美食\", \"性价比\",\"烤肉\",\"小龙虾\",\"好吃\",\"火锅\",\"超值\",\\\n",
    "                \"中山路\",\"鼓浪屿\",\\\n",
    "                \"攻略\",\"福利\",\"明发\",\"万象城\",\"健身\",\\\n",
    "                \"咖啡\",\"奶茶\",\"朋友\",\"便宜\",\"\",\"送\",\\\n",
    "                \"中华城\", \"爽\",\"免费\",\"大礼\",\"来\", \"香\",\"出炉\",\"清单\",\\\n",
    "                \"料理\", \"海鲜\",\\\n",
    "                \"折\",\\\n",
    "                ] #overwritable\n",
    "\n",
    "v_v_list = []\n",
    "\n",
    "for tag in tagging_list:\n",
    "    index_list = df_url_out [ df_url_out.title.str.contains(tag) ].index.tolist()\n",
    "    v_v_pairs = pd.DataFrame({tag:index_list}).melt().set_index(\"value\")\n",
    "    v_v_list.append(v_v_pairs)\n",
    "\n",
    "df_cat = v_v_list[0]\n",
    "for d in v_v_list:\n",
    "    df_cat.update(d)\n",
    "    \n",
    "# 尚未标记内容\n",
    "df_url_out.loc [ df_cat.query('variable==\"\"').index ]"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "df_url_out.loc[53].link"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>带你飞起来的超HOT健身馆！趣味燃脂减压时尚，厦门绝无仅有的室内蹦极！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>【旺巴蜀】限时2.99折 ！100+种口味任意撸！人均10元+吃掉一座山！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>人均25＋！一口锅吃下一整头牛 ！现切牛肉片片都是粘人精 ！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>LV设计总监亲自操刀，万象城的艺术餐厅，超精致 ！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>2 层楼的\"小香港\"，广东人扎堆去，这次终于被我找到了 ！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7411</th>\n",
       "      <td>必胜客又开挂！把「钻石」放在披萨上，还有着会喷汁的布拉塔芝士！</td>\n",
       "      <td>2019-03-19</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7412</th>\n",
       "      <td>Thank u mom【草莓炸鸡】来啦 ！只卖30天，错过等一年 ！</td>\n",
       "      <td>2019-03-19</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7413</th>\n",
       "      <td>45元！洗吹造型+日系合资护理 ！实力派坐镇，进店秒变男女神 ！</td>\n",
       "      <td>2019-03-19</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7414</th>\n",
       "      <td>5 折！轻奢级创意料理餐厅！9.1分点评网久居粤菜头牌 ！</td>\n",
       "      <td>2019-03-19</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7415</th>\n",
       "      <td>老店 | 26年的“厕所大排档\"，两代人的传承，双人餐仅需78元...</td>\n",
       "      <td>2019-03-19</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7392 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      title create_time  \\\n",
       "24      带你飞起来的超HOT健身馆！趣味燃脂减压时尚，厦门绝无仅有的室内蹦极！  2019-03-23   \n",
       "25    【旺巴蜀】限时2.99折 ！100+种口味任意撸！人均10元+吃掉一座山！  2019-03-23   \n",
       "26           人均25＋！一口锅吃下一整头牛 ！现切牛肉片片都是粘人精 ！  2019-03-23   \n",
       "27                LV设计总监亲自操刀，万象城的艺术餐厅，超精致 ！  2019-03-23   \n",
       "28            2 层楼的\"小香港\"，广东人扎堆去，这次终于被我找到了 ！  2019-03-22   \n",
       "...                                     ...         ...   \n",
       "7411        必胜客又开挂！把「钻石」放在披萨上，还有着会喷汁的布拉塔芝士！  2019-03-19   \n",
       "7412     Thank u mom【草莓炸鸡】来啦 ！只卖30天，错过等一年 ！  2019-03-19   \n",
       "7413       45元！洗吹造型+日系合资护理 ！实力派坐镇，进店秒变男女神 ！  2019-03-19   \n",
       "7414          5 折！轻奢级创意料理餐厅！9.1分点评网久居粤菜头牌 ！  2019-03-19   \n",
       "7415    老店 | 26年的“厕所大排档\"，两代人的传承，双人餐仅需78元...  2019-03-19   \n",
       "\n",
       "                                                   link  \n",
       "24    http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "25    http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "26    http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "27    http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "28    http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "...                                                 ...  \n",
       "7411  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7412  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7413  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7414  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7415  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "\n",
       "[7392 rows x 3 columns]"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out[df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>带你飞起来的超HOT健身馆！趣味燃脂减压时尚，厦门绝无仅有的室内蹦极！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>【旺巴蜀】限时2.99折 ！100+种口味任意撸！人均10元+吃掉一座山！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>人均25＋！一口锅吃下一整头牛 ！现切牛肉片片都是粘人精 ！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>LV设计总监亲自操刀，万象城的艺术餐厅，超精致 ！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2 层楼的\"小香港\"，广东人扎堆去，这次终于被我找到了 ！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7646</th>\n",
       "      <td>立减 688 元 | 【厦门海景千禧大酒店】婚宴春季特惠，火爆预定中 ！</td>\n",
       "      <td>2016-03-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7647</th>\n",
       "      <td>厦门美食地图（二） | 带你发现一个吃货的世外桃源 - 杏林 ！</td>\n",
       "      <td>2016-03-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7648</th>\n",
       "      <td>发现厦门 | 一位旗袍美女引发的厦门老百货回忆思潮 ...</td>\n",
       "      <td>2016-03-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7649</th>\n",
       "      <td>榴莲控最爱 | 必胜客【泰国金枕榴莲披萨】只要39元 ！</td>\n",
       "      <td>2016-03-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7650</th>\n",
       "      <td>发现厦门 | 她在沙坡尾开了六年独立书店，面朝大海，气定神闲 ...</td>\n",
       "      <td>2016-03-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>259 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      title create_time  \\\n",
       "0       带你飞起来的超HOT健身馆！趣味燃脂减压时尚，厦门绝无仅有的室内蹦极！  2019-03-23   \n",
       "1     【旺巴蜀】限时2.99折 ！100+种口味任意撸！人均10元+吃掉一座山！  2019-03-23   \n",
       "2            人均25＋！一口锅吃下一整头牛 ！现切牛肉片片都是粘人精 ！  2019-03-23   \n",
       "3                 LV设计总监亲自操刀，万象城的艺术餐厅，超精致 ！  2019-03-23   \n",
       "4             2 层楼的\"小香港\"，广东人扎堆去，这次终于被我找到了 ！  2019-03-22   \n",
       "...                                     ...         ...   \n",
       "7646   立减 688 元 | 【厦门海景千禧大酒店】婚宴春季特惠，火爆预定中 ！  2016-03-16   \n",
       "7647       厦门美食地图（二） | 带你发现一个吃货的世外桃源 - 杏林 ！  2016-03-14   \n",
       "7648          发现厦门 | 一位旗袍美女引发的厦门老百货回忆思潮 ...  2016-03-08   \n",
       "7649           榴莲控最爱 | 必胜客【泰国金枕榴莲披萨】只要39元 ！  2016-03-07   \n",
       "7650     发现厦门 | 她在沙坡尾开了六年独立书店，面朝大海，气定神闲 ...  2016-03-03   \n",
       "\n",
       "                                                   link  \n",
       "0     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "1     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "2     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "3     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "4     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "...                                                 ...  \n",
       "7646  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7647  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7648  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7649  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "7650  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...  \n",
       "\n",
       "[259 rows x 3 columns]"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out[~df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>variable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>带你飞起来的超HOT健身馆！趣味燃脂减压时尚，厦门绝无仅有的室内蹦极！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>来</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>【旺巴蜀】限时2.99折 ！100+种口味任意撸！人均10元+吃掉一座山！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>折</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>人均25＋！一口锅吃下一整头牛 ！现切牛肉片片都是粘人精 ！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>LV设计总监亲自操刀，万象城的艺术餐厅，超精致 ！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2 层楼的\"小香港\"，广东人扎堆去，这次终于被我找到了 ！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>香</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7646</th>\n",
       "      <td>立减 688 元 | 【厦门海景千禧大酒店】婚宴春季特惠，火爆预定中 ！</td>\n",
       "      <td>2016-03-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7647</th>\n",
       "      <td>厦门美食地图（二） | 带你发现一个吃货的世外桃源 - 杏林 ！</td>\n",
       "      <td>2016-03-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7648</th>\n",
       "      <td>发现厦门 | 一位旗袍美女引发的厦门老百货回忆思潮 ...</td>\n",
       "      <td>2016-03-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7649</th>\n",
       "      <td>榴莲控最爱 | 必胜客【泰国金枕榴莲披萨】只要39元 ！</td>\n",
       "      <td>2016-03-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7650</th>\n",
       "      <td>发现厦门 | 她在沙坡尾开了六年独立书店，面朝大海，气定神闲 ...</td>\n",
       "      <td>2016-03-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7651 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      title create_time  \\\n",
       "0       带你飞起来的超HOT健身馆！趣味燃脂减压时尚，厦门绝无仅有的室内蹦极！  2019-03-23   \n",
       "1     【旺巴蜀】限时2.99折 ！100+种口味任意撸！人均10元+吃掉一座山！  2019-03-23   \n",
       "2            人均25＋！一口锅吃下一整头牛 ！现切牛肉片片都是粘人精 ！  2019-03-23   \n",
       "3                 LV设计总监亲自操刀，万象城的艺术餐厅，超精致 ！  2019-03-23   \n",
       "4             2 层楼的\"小香港\"，广东人扎堆去，这次终于被我找到了 ！  2019-03-22   \n",
       "...                                     ...         ...   \n",
       "7646   立减 688 元 | 【厦门海景千禧大酒店】婚宴春季特惠，火爆预定中 ！  2016-03-16   \n",
       "7647       厦门美食地图（二） | 带你发现一个吃货的世外桃源 - 杏林 ！  2016-03-14   \n",
       "7648          发现厦门 | 一位旗袍美女引发的厦门老百货回忆思潮 ...  2016-03-08   \n",
       "7649           榴莲控最爱 | 必胜客【泰国金枕榴莲披萨】只要39元 ！  2016-03-07   \n",
       "7650     发现厦门 | 她在沙坡尾开了六年独立书店，面朝大海，气定神闲 ...  2016-03-03   \n",
       "\n",
       "                                                   link variable  \n",
       "0     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...        来  \n",
       "1     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...        折  \n",
       "2     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "3     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "4     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...        香  \n",
       "...                                                 ...      ...  \n",
       "7646  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "7647  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "7648  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "7649  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "7650  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "\n",
       "[7651 rows x 4 columns]"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_o = df_url_out.join(df_cat).replace(\"\", np.nan).fillna(\"无法分类\")\n",
    "df_o"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>variable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>带你飞起来的超HOT健身馆！趣味燃脂减压时尚，厦门绝无仅有的室内蹦极！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>来</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>336元双人酒店西餐免费送 ！厦门探秘圣地，就藏在这片「都会桃源」之中 ！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>免费</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0°C创意铜锅，锅底吃到饱，厦门找不到第二家 ！</td>\n",
       "      <td>2019-03-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>厦门版「明星大侦探」！抖音情感剧本，体验爱情的波折心酸 ...</td>\n",
       "      <td>2019-03-21</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>折</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>带你飞起来的超HOT健身馆！趣味燃脂减压时尚，厦门绝无仅有的室内蹦极！</td>\n",
       "      <td>2019-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>来</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7645</th>\n",
       "      <td>新店速递 | 英国最富盛名的咖啡品牌 COSTA 进驻厦门磐基 ！</td>\n",
       "      <td>2016-03-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7646</th>\n",
       "      <td>立减 688 元 | 【厦门海景千禧大酒店】婚宴春季特惠，火爆预定中 ！</td>\n",
       "      <td>2016-03-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7647</th>\n",
       "      <td>厦门美食地图（二） | 带你发现一个吃货的世外桃源 - 杏林 ！</td>\n",
       "      <td>2016-03-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7648</th>\n",
       "      <td>发现厦门 | 一位旗袍美女引发的厦门老百货回忆思潮 ...</td>\n",
       "      <td>2016-03-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7650</th>\n",
       "      <td>发现厦门 | 她在沙坡尾开了六年独立书店，面朝大海，气定神闲 ...</td>\n",
       "      <td>2016-03-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1441 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      title create_time  \\\n",
       "0       带你飞起来的超HOT健身馆！趣味燃脂减压时尚，厦门绝无仅有的室内蹦极！  2019-03-23   \n",
       "5     336元双人酒店西餐免费送 ！厦门探秘圣地，就藏在这片「都会桃源」之中 ！  2019-03-22   \n",
       "6                  0°C创意铜锅，锅底吃到饱，厦门找不到第二家 ！  2019-03-22   \n",
       "12          厦门版「明星大侦探」！抖音情感剧本，体验爱情的波折心酸 ...  2019-03-21   \n",
       "24      带你飞起来的超HOT健身馆！趣味燃脂减压时尚，厦门绝无仅有的室内蹦极！  2019-03-23   \n",
       "...                                     ...         ...   \n",
       "7645      新店速递 | 英国最富盛名的咖啡品牌 COSTA 进驻厦门磐基 ！  2016-03-16   \n",
       "7646   立减 688 元 | 【厦门海景千禧大酒店】婚宴春季特惠，火爆预定中 ！  2016-03-16   \n",
       "7647       厦门美食地图（二） | 带你发现一个吃货的世外桃源 - 杏林 ！  2016-03-14   \n",
       "7648          发现厦门 | 一位旗袍美女引发的厦门老百货回忆思潮 ...  2016-03-08   \n",
       "7650     发现厦门 | 她在沙坡尾开了六年独立书店，面朝大海，气定神闲 ...  2016-03-03   \n",
       "\n",
       "                                                   link variable  \n",
       "0     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...        来  \n",
       "5     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...       免费  \n",
       "6     http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "12    http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...        折  \n",
       "24    http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...        来  \n",
       "...                                                 ...      ...  \n",
       "7645  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "7646  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "7647  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "7648  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "7650  http://mp.weixin.qq.com/s?__biz=MjM5MjY2MzEyMA...     无法分类  \n",
       "\n",
       "[1441 rows x 4 columns]"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_o[df_o.title.str.contains(\"厦门\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>variable</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>无法分类</th>\n",
       "      <td>3216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>来</th>\n",
       "      <td>1566</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>折</th>\n",
       "      <td>1291</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>免费</th>\n",
       "      <td>1257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>香</th>\n",
       "      <td>313</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>爽</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>中华城</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>出炉</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>料理</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>海鲜</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>清单</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>送</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          title\n",
       "variable       \n",
       "无法分类       3216\n",
       "来          1566\n",
       "折          1291\n",
       "免费         1257\n",
       "香           313\n",
       "爽             2\n",
       "中华城           1\n",
       "出炉            1\n",
       "料理            1\n",
       "海鲜            1\n",
       "清单            1\n",
       "送             1"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_stats = df_o.groupby(by=\"variable\").agg({\"title\":\"count\"}).sort_values(by=\"title\", ascending=False)\n",
    "df_stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 输出"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_account.columns.name = \"rel_accounts\"\n",
    "df_o.columns.name = \"url_cat\"\n",
    "df_stats.columns.name = \"stats\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "_df_.columns.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the xlsxwriter workbook and worksheet objects.  \n",
    "with pd.ExcelWriter(fn[\"output\"][\"公众号_xlsx\"].format(公众号=公众号)) as writer:\n",
    "    workbook  = writer.book\n",
    "\n",
    "    for _df_ in [df_account, df_o, df_stats]:\n",
    "        _df_.to_excel(writer, sheet_name = _df_.columns.name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
